library(rvest)
library(xml2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
url <- "https://www.tomaticket.es/index.php?accion=search&buscador=&IdLugar=0&IdTag=4"
tomaticket <- read_html(url) |> xml_child()
df_theaters <- data.frame(theater = theaters)
df_shows <- data.frame(show_links)
clean_dates <- sub("^[^=]+=[^=]+=(.*)$", "\\1", datetime)
clean_dates <- sub(".*\"(\\d{4}-\\d{2}-\\d{2}).*", "\\1", clean_dates)
dates <- as.Date(clean_dates)
df_date <- data.frame(dates)
df_date$id <- c(1:100)
df_theaters$theater_name <- sub(".*/recintos/(.*)$", "\\1", df_theaters$theater)
df_theaters <- df_theaters %>%
select(-theater)
df_theaters$id <- c(1:100)
final_df <- left_join(df_theaters, df_date, by = "id")
df_shows$show <- sub(".*/entradas-(.*)", "\\1", df_shows$show_links)
df_shows <- df_shows %>%
select(-show_links)
df_shows$id <- c(1:100)
final_df <- left_join(final_df, df_shows, by = "id")
location_address <- grep('address', location, value = TRUE)
clean_location <- sub('.*="([^"]*)".*', '\\1', location_address)
df_location <- data.frame(clean_location)
df_location$id <- c(1:100)
final_df <- left_join(final_df, df_location, by = "id")
clean_prices <- sub('.*="([^"]*)".*', '\\1', only_prices)
df_prices <- data.frame(clean_prices)
df_prices$id <- c(1:100)
df_prices$clean_prices <- gsub(",", ".", df_prices$clean_prices)
df_prices$clean_prices <- as.numeric(df_prices$clean_prices)
df_prices$clean_prices <- round(df_prices$clean_prices, 2)
final_df <- left_join(final_df, df_prices, by = "id")
final_df$show <- gsub("-", " ", final_df$show)
final_df$theater_name <- gsub("-", " ", final_df$theater_name)
current_date <- as.Date("2024-03-22", format = "%Y-%m-%d")
current_date <- rep(current_date, times = 100)
df_current_date <- data.frame(current_date)
df_current_date$id <- c(1:100)
final_df <- left_join(final_df, df_current_date, by = "id")
final_df <- final_df %>%
select(id, everything())
final_df <- final_df %>%
rename(location = clean_location, prices = clean_prices)
final_df <- final_df %>%
relocate(dates, .after = prices)
final_df <- final_df %>%
mutate(days_elapsed = difftime(current_date, dates))
final_df <- left_join(final_df, df_1, by = "id")
final_df$days_elapsed <- as.numeric(final_df$days_elapsed, units = "days")
#write.csv(final_df, "final_df.csv", row.names = FALSE) # I have saved df_final as a database in case Tomaticket removes any links that might prevent us from retrieving that observation again
final_df <- read_csv("final_df.csv")
## Rows: 100 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): theater_name, show, location
## dbl (5): id, prices, days_elapsed, Latitud, Longitud
## date (2): dates, current_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#I remove observations 93 and 98 because the theater does not correspond to the show (in Tomaticket)
final_df <- final_df %>%
slice(-93, -98)
final_df
## # A tibble: 98 × 10
## id theater_name show location prices dates current_date days_elapsed
## <dbl> <chr> <chr> <chr> <dbl> <date> <date> <dbl>
## 1 1 teatros luc… dobl… Madrid 18 2021-05-07 2024-03-22 1050
## 2 2 teatre cond… yo s… Barcelo… 17 2022-06-17 2024-03-22 644
## 3 3 teatro enca… magi… Madrid 15 2022-08-27 2024-03-22 573
## 4 4 pequeno tea… cort… Madrid 20 2022-08-31 2024-03-22 569
## 5 5 off latina impo… Madrid 6.5 2022-09-03 2024-03-22 566
## 6 6 teatro arle… la m… Madrid 12 2022-09-26 2024-03-22 543
## 7 7 gran teatro… davi… Madrid 20 2022-10-08 2024-03-22 531
## 8 8 teatro arle… la a… Madrid 14 2022-10-12 2024-03-22 527
## 9 9 teatro capi… alex… Madrid 16 2022-10-22 2024-03-22 517
## 10 10 teatro figa… una … Madrid 16 2023-01-01 2024-03-22 446
## # ℹ 88 more rows
## # ℹ 2 more variables: Latitud <dbl>, Longitud <dbl>
ggplot(data = final_df, aes(x = reorder(location, location, length))) +
geom_bar(aes(fill = location)) +
geom_text(stat='count', aes(label=..count..), vjust=-0.5) +
labs(title = "Number of shows per city",
x = "City",
y = "Number of shows") +
theme_minimal()
At Tomaticket, there are 98 theater productions available, of which 82 are in theaters in Madrid, 13 in Barcelona, and one in Granada, Málaga, and Valladolid.
ggplot(final_df, aes(x = days_elapsed)) +
geom_density(fill = "skyblue", color = "navyblue") +
labs(title = "Distribution of duration (available days)", x = "Days", y = "Density") +
theme_minimal()
#Mean and mode
media <- mean(final_df$days_elapsed)
moda <- names(sort(-table(final_df$days_elapsed)))[1]
media
## [1] 255.3469
moda
## [1] "182"
We can observe that a large part of the theater productions are available in theaters for less than 250 days. On average, the shows advertised on Tomaticket are available in mean in each theater for about 256 days; however, the usual or typical availability is around 182 days (6 months).
avg_price_per_city <- final_df %>%
group_by(location) %>%
summarise(avg_price = mean(prices, na.rm = TRUE))
ggplot(avg_price_per_city, aes(x = reorder(location, avg_price), y = avg_price, fill = avg_price)) +
geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("€%.2f", avg_price)), vjust = -0.5, color = "black", size = 3) + # Añadir etiquetas de texto con los precios
labs(title = "Average Price by Location",
x = "Location",
y = "Average price") +
scale_fill_gradient(low = "lightgreen", high = "orange") + # Escala de color de menos a más precio
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Valladolid has the highest average price (€22) for the theater productions available on Tomaticket, followed by Barcelona (€15.53) and Madrid (€14.15). On the other hand, the cities with the cheapest theater productions are Granada and Málaga, with prices below €10.50.
However, for Valladolid, Granada, and Málaga, we only have one theater production available. This means that the average price for those cities will be equal to the price of that single observation.
In practical terms, this may result in a biased representation of the average price for those cities, as the price of a single observation may not be representative of the price distribution in that particular city. Therefore, it is important to consider this bias when interpreting the results of the analysis.
Therefore, we will focus our analysis on the theater productions advertised on Tomaticket only for Madrid, and we will compare between theaters.
data_madrid <- final_df %>%
filter(location == "Madrid")
precio_medio_por_teatro <- data_madrid %>%
group_by(theater_name) %>%
summarize(precio_medio = mean(prices))
ggplot(precio_medio_por_teatro, aes(x = reorder(theater_name, precio_medio), y = precio_medio)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Average Price per Theater",
x = "Theater",
y = "Average Price") +
theme(axis.text.x = element_text(angle = 40, size = 6, hjust = 1))
Let´s see the most expensive and the cheapest theaters in Madrid
data_madrid_tp <- data_madrid %>%
group_by(theater_name) %>%
summarise(avg_price = round(mean(prices, na.rm = TRUE), 2)) # 27 theaters
Most expensive theaters:
data_madrid_tp %>% slice_max(avg_price, n = 6)
## # A tibble: 6 × 2
## theater_name avg_price
## <chr> <dbl>
## 1 gran teatro caixabank principe pio madrid 21
## 2 teatro marquina 18.2
## 3 taberna flamenca el cortijo madrid 18
## 4 teatro alcazar madrid 17.5
## 5 teatro reina victoria madrid 17.5
## 6 pequeno teatro gran via madrid 16.5
Cheapest theaters:
data_madrid_tp %>% slice_min(avg_price, n = 6)
## # A tibble: 6 × 2
## theater_name avg_price
## <chr> <dbl>
## 1 iIntruso bar madrid 5
## 2 wit comedy club madrid 7
## 3 meltdown madrid 8
## 4 teatro bellas artes madrid 8
## 5 off latina 8.64
## 6 SOJO Laboratorio Teatral 11.0
obras_por_teatro <- data_madrid %>%
group_by(theater_name) %>%
summarize(num_obras = n())
ggplot(obras_por_teatro, aes(x = reorder(theater_name, num_obras), y = num_obras, fill = num_obras)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Número de Obras por Teatro",
x = "Teatro",
y = "Número de Obras") +
coord_flip() +
geom_text(aes(label = num_obras), hjust = -0.2, size = 3)
“Teatro Luchana” is the theater that hosts the most shows in Madrid (17), followed by “Teatro Lara” (10), “Teatro Arlequín” (9), and “Teatro Off Latina” (7).
largest_shows_madrid <- data_madrid %>%
top_n(5, days_elapsed)
largest_shows_madrid
## # A tibble: 5 × 10
## id theater_name show location prices dates current_date days_elapsed
## <dbl> <chr> <chr> <chr> <dbl> <date> <date> <dbl>
## 1 1 teatros luch… dobl… Madrid 18 2021-05-07 2024-03-22 1050
## 2 3 teatro encan… magi… Madrid 15 2022-08-27 2024-03-22 573
## 3 4 pequeno teat… cort… Madrid 20 2022-08-31 2024-03-22 569
## 4 5 off latina impo… Madrid 6.5 2022-09-03 2024-03-22 566
## 5 6 teatro arleq… la m… Madrid 12 2022-09-26 2024-03-22 543
## # ℹ 2 more variables: Latitud <dbl>, Longitud <dbl>
ggplot(data = largest_shows_madrid,
aes(x = reorder(show, -days_elapsed),
y = days_elapsed)) +
geom_bar(stat = "identity", fill = "blue") +
labs(x = "Show", y = "Duración (días desde su fecha hasta ahora)", title = "Duración de los 5 shows más largos") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The show ‘Doble o Nada’ stands out significantly above the rest, with a run time of over 1000 days. The other shows closely following it have an approximate duration of 500 days each.
plot_ly(data = data_madrid, x = ~days_elapsed, y = ~prices, text = ~paste("Nombre del show: ", show, "<br>Duración: ", current_date - dates, " días<br>Precio: $", prices), hoverinfo = "text",
type = 'scatter', mode = 'markers', marker = list(color = ~prices, colorscale = 'RdYlGn',
colorbar = list(title = "Precio", ticksuffix = "$"))) %>%
layout(title = "Relación entre Días Activos del Show y Precio Medio",
xaxis = list(title = "Días Activos del Show"),
yaxis = list(title = "Precio Medio"))
In shows priced at €10 or higher, there is a certain positive relationship observed between the number of days the show is active and its price. Although it is not a very strong relationship.
teatros_con_mas_shows <- data_madrid %>%
group_by(theater_name) %>%
summarize(num_obras = n(),
prices = mean(prices),
days_elapsed = mean(days_elapsed)) %>%
arrange(desc(num_obras))
data_clustering <- teatros_con_mas_shows %>%
select(theater_name, prices, num_obras, days_elapsed)
#normalization
data_clustering_norm <- data_clustering %>%
select(-theater_name) %>%
scale()
#K.means clustering
set.seed(123)
kmeans_model <- kmeans(data_clustering_norm, centers = 3)
teatros_con_mas_shows$cluster <- as.factor(kmeans_model$cluster)
grafico <- ggplot(teatros_con_mas_shows, aes(x = prices, y = days_elapsed, color = cluster,
text = paste("Teatro:", theater_name, "<br>Precio Medio:", prices, "<br>Days Elapsed:", days_elapsed))) +
geom_point() +
stat_ellipse(type = "norm", level = 0.95, linetype = "dashed") +
labs(title = "Clustering de Teatros",
x = "Precio Medio",
y = "Days elapsed") +
scale_color_manual(values = c("blue", "green", "purple"))
grafico_interactivo <- ggplotly(grafico, tooltip = "text") %>%
plotly::layout(hovermode = "closest")
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
grafico_interactivo
We see that we could group the 27 theaters offering shows on Tomaticket into three different groups, taking into account their price and the number of days they are on the billboard.
Group 1, formed by those theaters whose offered shows have a relatively high average price and are also available on the billboard for more than 350 days.
Group 2, formed by theaters with the lowest prices and whose shows, on average, are available on the billboard for less than 300 days.
And finally, Group 3, which consists of theaters where the availability of shows is less than 300 days, but their prices are relatively high.
data_madrid <- data_madrid %>%
mutate(Longitud = as.numeric(Longitud),
Latitud = as.numeric(Latitud)) %>%
na.omit()
map <- leaflet(data_madrid) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
setView(lng = median(data_madrid$Longitud), lat = median(data_madrid$Latitud), zoom = 9)
color_pal <- colorFactor(
palette = c( "green", "red"),
domain = data_madrid$prices
)
map <- map %>%
addCircles(lng = ~Longitud,
lat = ~Latitud,
color = ~color_pal(prices),
popup = ~paste("Teatro:",theater_name, "<br>Precio:" ,prices))
map <- map %>%
addLegend(position = "bottomleft", pal = color_pal, values = ~prices, bins = 4)
map
Regarding the location of theaters in Madrid whose shows are offered on Tomaticket, the most notable aspect is that, at least for those that Tomaticket provides location information for, all the theaters are located in the city center.
If we consider the relationship between location and price, we do not observe any notable pattern.